Linear Regression Modelling for Soccer Data

Load packages and set directories

library(boot)
library(tidyverse)
library(broom)
setwd('~/Library/Mobile Documents/com~apple~CloudDocs/Udveksling/Studie/LS 88 - 3/Project/ls88_project-master')

Load data

df = read_csv('dataframe.csv')
Missing column names filled in: 'X1' [1]Parsed with column specification:
cols(
  .default = col_integer(),
  season = col_character(),
  date = col_datetime(format = ""),
  home_team_name = col_character(),
  away_team_name = col_character(),
  home_poss = col_double(),
  away_poss = col_double()
)
See spec(...) for full column specifications.

Retain only non-nominal features

df <- df %>% 
      select(-X1, -id, -country_id, -league_id, -stage, -date, -match_api_id, -home_team_api_id,
            -away_team_api_id, -home_team_name, -away_team_name, -season)

Add Goal Difference as target variable and print distribution

df <- df %>%
      mutate(goal_diff = home_team_goal - away_team_goal)
hist(df$goal_diff)

Looks perfectly normaly distributed.

As the target variable is derived from home_team_goal and away_team_goal we drop these variables.

df <- df %>% 
      select(-home_team_goal, -away_team_goal)

Based on the correlation matrix from the Jupyter Notebook, we will drop variables: home_points away_points

As these 2 variables have too much correlation with the rest of the variables and are interdependent.

df <- df %>% 
      select(-home_points, -away_points)

We scale the variables without centering to not have certain variables dominate too much

df_scaled <- as.data.frame(scale(df))

Fitting a linear model (iteration 1) We have: Response variable: Goal Difference Explanatory variables: All Others + Intercept

model1 <- lm(goal_diff ~ .,
              data = df)
#Print Summary
print(summary(model1))

Call:
lm(formula = goal_diff ~ ., data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.7374 -1.0369 -0.0032  0.9892  7.7300 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
(Intercept)  -1.0339389  0.7745341  -1.335 0.182004    
home_y1      -0.2117143  0.0285360  -7.419 1.52e-13 ***
home_y2      -0.8705695  0.2043061  -4.261 2.10e-05 ***
home_r       -1.0733716  0.1484183  -7.232 6.00e-13 ***
away_y1       0.0132646  0.0263142   0.504 0.614239    
away_y2       0.5317117  0.1447930   3.672 0.000245 ***
away_r        0.9035694  0.1321295   6.839 9.64e-12 ***
home_fouls    0.0143885  0.0093812   1.534 0.125194    
away_fouls   -0.0183639  0.0091475  -2.008 0.044783 *  
home_poss     0.0507326  0.0077174   6.574 5.76e-11 ***
away_poss    -0.0096464  0.0077888  -1.239 0.215624    
home_shoton  -0.0031848  0.0100736  -0.316 0.751905    
away_shoton   0.0001503  0.0119034   0.013 0.989927    
home_shotoff -0.0275419  0.0110350  -2.496 0.012618 *  
away_shotoff -0.0120150  0.0126634  -0.949 0.342796    
home_corners -0.0270724  0.0114862  -2.357 0.018490 *  
away_corners  0.0036774  0.0131484   0.280 0.779742    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.663 on 3023 degrees of freedom
Multiple R-squared:  0.141, Adjusted R-squared:  0.1365 
F-statistic: 31.02 on 16 and 3023 DF,  p-value: < 2.2e-16

Plot Model1

#Plot residuals
plot(model1$residuals)

Residuals look great. (centered around 0 with SD approximate to 1).

The intercept is insignificant. Will be removed. Model 2 Removing intercept.

model2 <- lm(goal_diff ~ . -1,
              data = df)
#Plot residuals
plot(model2$residuals)

#Print Summary
print(summary(model2))

Call:
lm(formula = goal_diff ~ . - 1, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.7571 -1.0326 -0.0055  0.9821  7.7238 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
home_y1      -0.2112893  0.0285379  -7.404 1.71e-13 ***
home_y2      -0.8668292  0.2043133  -4.243 2.28e-05 ***
home_r       -1.0731757  0.1484374  -7.230 6.10e-13 ***
away_y1       0.0124134  0.0263099   0.472 0.637092    
away_y2       0.5356928  0.1447810   3.700 0.000219 ***
away_r        0.9033008  0.1321464   6.836 9.84e-12 ***
home_fouls    0.0131246  0.0093345   1.406 0.159820    
away_fouls   -0.0194850  0.0091101  -2.139 0.032528 *  
home_poss     0.0411316  0.0027986  14.697  < 2e-16 ***
away_poss    -0.0193323  0.0028321  -6.826 1.05e-11 ***
home_shoton  -0.0041805  0.0100472  -0.416 0.677375    
away_shoton  -0.0006627  0.0118894  -0.056 0.955556    
home_shotoff -0.0291513  0.0109704  -2.657 0.007919 ** 
away_shotoff -0.0142992  0.0125488  -1.139 0.254593    
home_corners -0.0280717  0.0114633  -2.449 0.014388 *  
away_corners  0.0030107  0.0131407   0.229 0.818795    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.663 on 3024 degrees of freedom
Multiple R-squared:  0.1798,    Adjusted R-squared:  0.1754 
F-statistic: 41.43 on 16 and 3024 DF,  p-value: < 2.2e-16

Notice how home_shotoff is significant with a negative effect on the goal difference (teams who miss a lot of shots at home will achieve a worse goal difference). It is also noticeable how getting a lot of corners on your home turf decreases your goal difference. Surprisingly, shots on goal do not seem to have a significant effect.

BOOTSTRAPPING

Non-parametric bootstrapping for the coeffiecents with 5000 models. We do not know the true distribution and use nonparametric bootstrap sampling with replacement accordingly. This will use the empirical distribution.

###Code copied from statmethods.net
#Function to obatin regression weights
bs <- function(formula, data, indices){
  d <- data[indices, ] #allows boot to select sample
  fit <- lm(formula, data = d)
  return(coef(fit))
}
#Creating own boostrap method
bootstrap <- function(data, formula, k){
  return.matrix <- matrix(nrow = k, ncol = 16)
  
  set.seed(200)
  for(i in 1:k){
    sample_temp <- sample_n(data, size = nrow(data), replace = TRUE)
    model_temp <- lm(sample_temp,
                   formula = formula)
    return.matrix[i, ] <- coefficients(model_temp)
  }
  return(return.matrix)
}
#results_own <- bootstrap(df_scaled, goal_diff ~. - 1, 5000)
#Bootstrapping with 5000 replications
results <- boot(df, 
                statistic = bs,
                R = 5000,
                formula = goal_diff ~ . - 1)

Plot distributions of parameters

for(i in 1:length(results$t0)){
  hist(results$t[,i], 
       prob = TRUE,
       main = colnames(df)[i])
  abline(v = coefficients(model2)[i], col = 'red')#
}

coefficients(model2) - apply(results$t, 2, mean)
      home_y1       home_y2        home_r       away_y1       away_y2        away_r    home_fouls 
 2.340095e-04  1.018827e-03  1.942471e-03 -2.651094e-04  2.172175e-03 -1.336380e-03  1.045057e-04 
   away_fouls     home_poss     away_poss   home_shoton   away_shoton  home_shotoff  away_shotoff 
-1.521606e-04 -2.336063e-05  3.514283e-05 -1.683637e-04  2.079877e-04  1.375954e-04 -2.421452e-04 
 home_corners  away_corners 
-2.557286e-05  1.752649e-04 
SUMMARY

It was found that parameter values were all extremely likely based on the 5000 repetions of nonparametric bootstrapping based on the histograms below, where the red line represents the found parameter value for the model.

for(i in 1:length(results$t0)){
  hist(results$t[,i], 
       prob = TRUE,
       main = colnames(df)[i])
  abline(v = coefficients(model2)[i], col = 'red')#
}

The fitted model (model 2 without intercept) showed well-behaved residuals and significance on 10/16 parameters

plot(model2$residuals)

summary(model2)

Call:
lm(formula = goal_diff ~ . - 1, data = df)

Residuals:
    Min      1Q  Median      3Q     Max 
-6.7571 -1.0326 -0.0055  0.9821  7.7238 

Coefficients:
               Estimate Std. Error t value Pr(>|t|)    
home_y1      -0.2112893  0.0285379  -7.404 1.71e-13 ***
home_y2      -0.8668292  0.2043133  -4.243 2.28e-05 ***
home_r       -1.0731757  0.1484374  -7.230 6.10e-13 ***
away_y1       0.0124134  0.0263099   0.472 0.637092    
away_y2       0.5356928  0.1447810   3.700 0.000219 ***
away_r        0.9033008  0.1321464   6.836 9.84e-12 ***
home_fouls    0.0131246  0.0093345   1.406 0.159820    
away_fouls   -0.0194850  0.0091101  -2.139 0.032528 *  
home_poss     0.0411316  0.0027986  14.697  < 2e-16 ***
away_poss    -0.0193323  0.0028321  -6.826 1.05e-11 ***
home_shoton  -0.0041805  0.0100472  -0.416 0.677375    
away_shoton  -0.0006627  0.0118894  -0.056 0.955556    
home_shotoff -0.0291513  0.0109704  -2.657 0.007919 ** 
away_shotoff -0.0142992  0.0125488  -1.139 0.254593    
home_corners -0.0280717  0.0114633  -2.449 0.014388 *  
away_corners  0.0030107  0.0131407   0.229 0.818795    
---
Signif. codes:  0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1

Residual standard error: 1.663 on 3024 degrees of freedom
Multiple R-squared:  0.1798,    Adjusted R-squared:  0.1754 
F-statistic: 41.43 on 16 and 3024 DF,  p-value: < 2.2e-16

The model is of the form goal_difference = beta1x1 + beta2x2 …,

where goal_difference = home_goal - away_goal

LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKTGluZWFyIFJlZ3Jlc3Npb24gTW9kZWxsaW5nIGZvciBTb2NjZXIgRGF0YQotLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tLS0tCgpMb2FkIHBhY2thZ2VzIGFuZCBzZXQgZGlyZWN0b3JpZXMKYGBge3J9CmxpYnJhcnkoYm9vdCkKbGlicmFyeSh0aWR5dmVyc2UpCmxpYnJhcnkoYnJvb20pCnNldHdkKCd+L0xpYnJhcnkvTW9iaWxlIERvY3VtZW50cy9jb21+YXBwbGV+Q2xvdWREb2NzL1VkdmVrc2xpbmcvU3R1ZGllL0xTIDg4IC0gMy9Qcm9qZWN0L2xzODhfcHJvamVjdC1tYXN0ZXInKQpgYGAKCgpMb2FkIGRhdGEKYGBge3J9CmRmID0gcmVhZF9jc3YoJ2RhdGFmcmFtZS5jc3YnKQpgYGAKCgpSZXRhaW4gb25seSBub24tbm9taW5hbCBmZWF0dXJlcwpgYGB7cn0KZGYgPC0gZGYgJT4lIAogICAgICBzZWxlY3QoLVgxLCAtaWQsIC1jb3VudHJ5X2lkLCAtbGVhZ3VlX2lkLCAtc3RhZ2UsIC1kYXRlLCAtbWF0Y2hfYXBpX2lkLCAtaG9tZV90ZWFtX2FwaV9pZCwKICAgICAgICAgICAgLWF3YXlfdGVhbV9hcGlfaWQsIC1ob21lX3RlYW1fbmFtZSwgLWF3YXlfdGVhbV9uYW1lLCAtc2Vhc29uKQpgYGAKCgoKQWRkIEdvYWwgRGlmZmVyZW5jZSBhcyB0YXJnZXQgdmFyaWFibGUgYW5kIHByaW50IGRpc3RyaWJ1dGlvbgpgYGB7cn0KZGYgPC0gZGYgJT4lCiAgICAgIG11dGF0ZShnb2FsX2RpZmYgPSBob21lX3RlYW1fZ29hbCAtIGF3YXlfdGVhbV9nb2FsKQoKaGlzdChkZiRnb2FsX2RpZmYpCmBgYApMb29rcyBwZXJmZWN0bHkgbm9ybWFseSBkaXN0cmlidXRlZC4KCkFzIHRoZSB0YXJnZXQgdmFyaWFibGUgaXMgZGVyaXZlZCBmcm9tIGhvbWVfdGVhbV9nb2FsIGFuZCBhd2F5X3RlYW1fZ29hbCB3ZSBkcm9wIHRoZXNlIHZhcmlhYmxlcy4KYGBge3J9CmRmIDwtIGRmICU+JSAKICAgICAgc2VsZWN0KC1ob21lX3RlYW1fZ29hbCwgLWF3YXlfdGVhbV9nb2FsKQpgYGAKCgoKQmFzZWQgb24gdGhlIGNvcnJlbGF0aW9uIG1hdHJpeCBmcm9tIHRoZSBKdXB5dGVyIE5vdGVib29rLCB3ZSB3aWxsIGRyb3AgdmFyaWFibGVzOgpob21lX3BvaW50cwphd2F5X3BvaW50cwoKQXMgdGhlc2UgMiB2YXJpYWJsZXMgaGF2ZSB0b28gbXVjaCBjb3JyZWxhdGlvbiB3aXRoIHRoZSByZXN0IG9mIHRoZSB2YXJpYWJsZXMgYW5kIGFyZSBpbnRlcmRlcGVuZGVudC4KYGBge3J9CmRmIDwtIGRmICU+JSAKICAgICAgc2VsZWN0KC1ob21lX3BvaW50cywgLWF3YXlfcG9pbnRzKQpgYGAKCgpXZSBzY2FsZSB0aGUgdmFyaWFibGVzIHdpdGhvdXQgY2VudGVyaW5nIHRvIG5vdCBoYXZlIGNlcnRhaW4gdmFyaWFibGVzIGRvbWluYXRlIHRvbyBtdWNoCmBgYHtyfQpkZl9zY2FsZWQgPC0gYXMuZGF0YS5mcmFtZShzY2FsZShkZikpCmBgYAoKRml0dGluZyBhIGxpbmVhciBtb2RlbCAoaXRlcmF0aW9uIDEpCldlIGhhdmU6ClJlc3BvbnNlIHZhcmlhYmxlOiBHb2FsIERpZmZlcmVuY2UKRXhwbGFuYXRvcnkgdmFyaWFibGVzOiBBbGwgT3RoZXJzICsgSW50ZXJjZXB0CmBgYHtyfQptb2RlbDEgPC0gbG0oZ29hbF9kaWZmIH4gLiwKICAgICAgICAgICAgICBkYXRhID0gZGYpCiNQcmludCBTdW1tYXJ5CnByaW50KHN1bW1hcnkobW9kZWwxKSkKYGBgCgpQbG90IE1vZGVsMQpgYGB7cn0KI1Bsb3QgcmVzaWR1YWxzCnBsb3QobW9kZWwxJHJlc2lkdWFscykKYGBgClJlc2lkdWFscyBsb29rIGdyZWF0LiAoY2VudGVyZWQgYXJvdW5kIDAgd2l0aCBTRCBhcHByb3hpbWF0ZSB0byAxKS4KClRoZSBpbnRlcmNlcHQgaXMgaW5zaWduaWZpY2FudC4gV2lsbCBiZSByZW1vdmVkLgpNb2RlbCAyClJlbW92aW5nIGludGVyY2VwdC4KYGBge3J9Cm1vZGVsMiA8LSBsbShnb2FsX2RpZmYgfiAuIC0xLAogICAgICAgICAgICAgIGRhdGEgPSBkZikKCiNQbG90IHJlc2lkdWFscwpwbG90KG1vZGVsMiRyZXNpZHVhbHMpCgojUHJpbnQgU3VtbWFyeQpwcmludChzdW1tYXJ5KG1vZGVsMikpCmBgYApOb3RpY2UgaG93IGhvbWVfc2hvdG9mZiBpcyBzaWduaWZpY2FudCB3aXRoIGEgbmVnYXRpdmUgZWZmZWN0IG9uIHRoZSBnb2FsIGRpZmZlcmVuY2UgKHRlYW1zIHdobyBtaXNzIGEgbG90IG9mIHNob3RzIGF0IGhvbWUgd2lsbCBhY2hpZXZlIGEgd29yc2UgZ29hbCBkaWZmZXJlbmNlKS4gSXQgaXMgYWxzbyBub3RpY2VhYmxlIGhvdyBnZXR0aW5nIGEgbG90IG9mIGNvcm5lcnMgb24geW91ciBob21lIHR1cmYgZGVjcmVhc2VzIHlvdXIgZ29hbCBkaWZmZXJlbmNlLgpTdXJwcmlzaW5nbHksIHNob3RzIG9uIGdvYWwgZG8gbm90IHNlZW0gdG8gaGF2ZSBhIHNpZ25pZmljYW50IGVmZmVjdC4KCi0tLS0tLS0tLS0tLS0tLS0tLS0KQk9PVFNUUkFQUElORwotLS0tLS0tLS0tLS0tLS0tLS0tCgpOb24tcGFyYW1ldHJpYyBib290c3RyYXBwaW5nIGZvciB0aGUgY29lZmZpZWNlbnRzIHdpdGggNTAwMCBtb2RlbHMuCldlIGRvIG5vdCBrbm93IHRoZSB0cnVlIGRpc3RyaWJ1dGlvbiBhbmQgdXNlIG5vbnBhcmFtZXRyaWMgYm9vdHN0cmFwIHNhbXBsaW5nIHdpdGggcmVwbGFjZW1lbnQgYWNjb3JkaW5nbHkuClRoaXMgd2lsbCB1c2UgdGhlIGVtcGlyaWNhbCBkaXN0cmlidXRpb24uCgpgYGB7cn0KIyMjQ29kZSBjb3BpZWQgZnJvbSBzdGF0bWV0aG9kcy5uZXQKI0Z1bmN0aW9uIHRvIG9iYXRpbiByZWdyZXNzaW9uIHdlaWdodHMKYnMgPC0gZnVuY3Rpb24oZm9ybXVsYSwgZGF0YSwgaW5kaWNlcyl7CiAgZCA8LSBkYXRhW2luZGljZXMsIF0gI2FsbG93cyBib290IHRvIHNlbGVjdCBzYW1wbGUKICBmaXQgPC0gbG0oZm9ybXVsYSwgZGF0YSA9IGQpCiAgcmV0dXJuKGNvZWYoZml0KSkKfQoKCiNDcmVhdGluZyBvd24gYm9vc3RyYXAgbWV0aG9kCmJvb3RzdHJhcCA8LSBmdW5jdGlvbihkYXRhLCBmb3JtdWxhLCBrKXsKICByZXR1cm4ubWF0cml4IDwtIG1hdHJpeChucm93ID0gaywgbmNvbCA9IDE2KQogIAogIHNldC5zZWVkKDIwMCkKICBmb3IoaSBpbiAxOmspewogICAgc2FtcGxlX3RlbXAgPC0gc2FtcGxlX24oZGF0YSwgc2l6ZSA9IG5yb3coZGF0YSksIHJlcGxhY2UgPSBUUlVFKQogICAgbW9kZWxfdGVtcCA8LSBsbShzYW1wbGVfdGVtcCwKICAgICAgICAgICAgICAgICAgIGZvcm11bGEgPSBmb3JtdWxhKQogICAgcmV0dXJuLm1hdHJpeFtpLCBdIDwtIGNvZWZmaWNpZW50cyhtb2RlbF90ZW1wKQogIH0KICByZXR1cm4ocmV0dXJuLm1hdHJpeCkKfQoKI3Jlc3VsdHNfb3duIDwtIGJvb3RzdHJhcChkZl9zY2FsZWQsIGdvYWxfZGlmZiB+LiAtIDEsIDUwMDApCiNCb290c3RyYXBwaW5nIHdpdGggNTAwMCByZXBsaWNhdGlvbnMKcmVzdWx0cyA8LSBib290KGRmLCAKICAgICAgICAgICAgICAgIHN0YXRpc3RpYyA9IGJzLAogICAgICAgICAgICAgICAgUiA9IDUwMDAsCiAgICAgICAgICAgICAgICBmb3JtdWxhID0gZ29hbF9kaWZmIH4gLiAtIDEpCgpgYGAKClBsb3QgZGlzdHJpYnV0aW9ucyBvZiBwYXJhbWV0ZXJzCmBgYHtyfQpmb3IoaSBpbiAxOmxlbmd0aChyZXN1bHRzJHQwKSl7CiAgaGlzdChyZXN1bHRzJHRbLGldLCAKICAgICAgIHByb2IgPSBUUlVFLAogICAgICAgbWFpbiA9IGNvbG5hbWVzKGRmKVtpXSkKICBhYmxpbmUodiA9IGNvZWZmaWNpZW50cyhtb2RlbDIpW2ldLCBjb2wgPSAncmVkJykjCn0KYGBgCmBgYHtyfQpjb2VmZmljaWVudHMobW9kZWwyKSAtIGFwcGx5KHJlc3VsdHMkdCwgMiwgbWVhbikKYGBgCgoKLS0tLS0tLS0tLS0tLS0tLS0KU1VNTUFSWQotLS0tLS0tLS0tLS0tLS0tLQoKSXQgd2FzIGZvdW5kIHRoYXQgcGFyYW1ldGVyIHZhbHVlcyB3ZXJlIGFsbCBleHRyZW1lbHkgbGlrZWx5IGJhc2VkIG9uIHRoZSA1MDAwIHJlcGV0aW9ucyBvZiBub25wYXJhbWV0cmljIGJvb3RzdHJhcHBpbmcgYmFzZWQgb24gdGhlIGhpc3RvZ3JhbXMgYmVsb3csIHdoZXJlIHRoZSByZWQgbGluZSByZXByZXNlbnRzIHRoZSBmb3VuZCBwYXJhbWV0ZXIgdmFsdWUgCmZvciB0aGUgbW9kZWwuCmBgYHtyfQpmb3IoaSBpbiAxOmxlbmd0aChyZXN1bHRzJHQwKSl7CiAgaGlzdChyZXN1bHRzJHRbLGldLCAKICAgICAgIHByb2IgPSBUUlVFLAogICAgICAgbWFpbiA9IGNvbG5hbWVzKGRmKVtpXSkKICBhYmxpbmUodiA9IGNvZWZmaWNpZW50cyhtb2RlbDIpW2ldLCBjb2wgPSAncmVkJykjCn0KYGBgCgoKVGhlIGZpdHRlZCBtb2RlbCAobW9kZWwgMiB3aXRob3V0IGludGVyY2VwdCkgc2hvd2VkIHdlbGwtYmVoYXZlZCByZXNpZHVhbHMgCmFuZCBzaWduaWZpY2FuY2Ugb24gMTAvMTYgcGFyYW1ldGVycwpgYGB7cn0KcGxvdChtb2RlbDIkcmVzaWR1YWxzKQpgYGAKYGBge3J9CnN1bW1hcnkobW9kZWwyKQpgYGAKClRoZSBtb2RlbCBpcyBvZiB0aGUgZm9ybQpnb2FsX2RpZmZlcmVuY2UgPSBiZXRhMSp4MSArIGJldGEyKngyIC4uLiwKCndoZXJlIGdvYWxfZGlmZmVyZW5jZSA9IGhvbWVfZ29hbCAtIGF3YXlfZ29hbA==